La idea de este taller es manipular archivos (leerlos, parsearlos y escribirlos) y hacer lo mismo con bases de datos estructuradas.
In [1]:
import pandas as pd
df = pd.read_csv('datoss.tsv', sep='\t')
In [ ]:
df [1:1]
La base de datos se compone de 37 columnas (Variables). En términos generales los datos exponen la asociación genética de un tipo de patología (eg. Cáncer, Asma) y la información "detallada" sobre el estudio que determinó dicha relación.
Esta base de datos permite almacenar de forma ordenada la relación entre rasgos genéticos y fenotípicos. Los estudios contenidos en esta base de datos tienen como intención descubrir la clave para prevenir, diagnosticar y tratar una enfermedad. Esto, también es conocido como estudio de asociación de genoma completo.
Qué Entidades (tablas) puede definir?
Cree la base de datos (copie el código SQL que se usó)
In [8]:
CREATE TABLE journal
(
id_JOURNAL int auto_increment PRIMARY KEY,
namejournal varchar (300)
);
CREATE TABLE platform
(
id_platform INT auto_increment NOT NULL PRIMARY KEY,
PLATFORMname VARCHAR(300)
);
CREATE TABLE study
(
id_STUDY int auto_increment PRIMARY KEY,
STUDY text,
INITIAL_SAMPLE_SIZE int,
REPLICATION_SAMPLE_SIZE int,
id_platform int,
foreign key (id_platform) references platform(id_platform),
P_VALUE int,
PVALUE_MLOG int,
PVALUE_TEXT varchar (300),
CI_text varchar (300),
OR_BETA int,
MAPPED_TRAIT varchar (300),
MAPPED_TRAIT_URI varchar (300)
);
CREATE TABLE publicacion
(
id_publicacion int auto_increment PRIMARY KEY,
PUBMEDID varchar (300),
FIRSTAUTHOR varchar (300),
id_journal int,
foreign key (id_JOURNAL) references journal(id_JOURNAL),
LINK varchar (300),
STUDY_ACCESSION varchar (300),
id_STUDY int,
foreign key (id_STUDY) references study(id_STUDY)
);
CREATE TABLE enfermedad
(
id_enfermedad int auto_increment PRIMARY KEY,
DISEASETRAITenfermedad VARCHAR(300)
);
CREATE TABLE loci
(
id_loci int auto_increment PRIMARY KEY,
REGION text,
CHR_ID text,
CHR_POS text,
REPORTED_GENE text,
MAPPED_GENE text,
UPSTREAM_GENE_ID text,
DOWNSTREAM_GENE_ID text,
SNP_GENE_IDS text,
UPSTREAM_GENE_DISTANCE text,
DOWNSTREAM_GENE_DISTANCE text,
STRONGEST_SNP_RISK_ALLELE text,
SNPS text,
MERGED text,
SNP_ID_CURRENT text,
CONTEXT_ text,
INTERGENIC text
);
CREATE TABLE enfermedad_loci
(
id_enfermedad int,
id_loci int,
PRIMARY KEY (id_enfermedad, id_loci),
foreign key (id_enfermedad) references enfermedad(id_enfermedad),
foreign key (id_loci) references loci(id_loci)
);
In [ ]:
#Leer el archivo
df.head(1)
In [3]:
import mysql.connector
cnx = mysql.connector.connect(user='root', password='fnsQFJ14',
host='127.0.0.1', database='new_schema')
In [ ]:
hostname = '127.0.0.1'
username = 'root'
password = 'fnsQFJ14'
database = 'new_schema'
def doQuery( conn ) :
cur = conn.cursor()
cur.execute( "select * from platform" )
for id_nombre, nombre_plat in cur.fetchall() :
print (id_nombre, nombre_plat)
myConnection = mysql.connector.connect( host=hostname, user=username, passwd=password, db=database )
doQuery( myConnection )
myConnection.close()
In [6]:
# Guardar informacion de plataforma
def get_platformId(names_platf):
cur = myConnection.cursor()
cur.execute( """select * from platform where PLATFORM_SNPS_PASSplatformING_QC = "%s" """ % (names_platf) )
id_plat = None
for id_, nombre_plat in cur.fetchall() :
id_plat = id_
if not id_plat:
print("""insert into platform values (NULL, "%s" )""" % (names_platf))
cur.execute("""insert into platform values (NULL, "%s" )""" % (names_platf))
cur.execute("SELECT LAST_INSERT_ID()")
id_plat = cur.fetchall()[0][0]
myConnection.commit()
return id_plat
hostname = '127.0.0.1'
username = 'root'
password = 'fnsQFJ14'
database = 'new_schema'
myConnection = mysql.connector.connect( host=hostname, user=username, passwd=password, db=database )
for index, row in df.iterrows():
plat_name = row['PLATFORM [SNPS PASSING QC]']
platform_id = get_platformId(plat_name)
print()
myConnection.close()
In [9]:
# Guardar informacion de journal
def get_journalId(names_journal):
cur = myConnection.cursor()
cur.execute( """select * from journal where namejournal = "%s" """ % (names_journal) )
id_jour = None
for id_, nombre_journ in cur.fetchall() :
id_jour = id_
if not id_jour:
print("""insert into journal values (NULL, "%s" )""" % (names_journal))
cur.execute("""insert into journal values (NULL, "%s" )""" % (names_journal))
cur.execute("SELECT LAST_INSERT_ID()")
id_jour = cur.fetchall()[0][0]
myConnection.commit()
return id_jour
hostname = '127.0.0.1'
username = 'root'
password = 'fnsQFJ14'
database = 'new_schema'
myConnection = mysql.connector.connect( host=hostname, user=username, passwd=password, db=database )
for index, row in df.iterrows():
journal_name = row['JOURNAL']
journal_id = get_journalId(journal_name)
print()
myConnection.close()
In [12]:
# Guardar información de enferemdad
def get_enfermedadlId(names_enferm):
cur = myConnection.cursor()
cur.execute( """select * from enfermedad where DISEASETRAIT = "%s" """ % (names_enferm) )
id_enfer = None
for id_, id_platform in cur.fetchall() :
id_enfer = id_
if not id_enfer:
print("""insert into enfermedad values (NULL, "%s" )""" % (names_enferm))
cur.execute("""insert into enfermedad values (NULL, "%s" )""" % (names_enferm))
cur.execute("SELECT LAST_INSERT_ID()")
id_enfer = cur.fetchall()[0][0]
myConnection.commit()
return id_enfer
hostname = '127.0.0.1'
username = 'root'
password = 'fnsQFJ14'
database = 'new_schema'
myConnection = mysql.connector.connect( host=hostname, user=username, passwd=password, db=database )
for index, row in df.iterrows():
enf_name = row['DISEASE/TRAIT']
enferm_id = get_enfermedadlId(enf_name)
print()
myConnection.close()
In [ ]:
#Guardar información loci
def get_lociId(regionloci, chro, chrpos, repor, mappede, geneups, genedows, snp, upstr, downs, riskalle, snps, merged,
snpid, contexts, intergenic):
cur = myConnection.cursor()
cur.execute( """select id_loci, CHR_ID, CHR_POS from loci where CHR_ID = "%s" and CHR_POS = "%s"
""" %(chro, chrpos))
id_loci = None
for id_, chrm, pos in cur.fetchall() :
print(id_)
id_loci = id_
if not id_loci:
#print("""insert into loci values (NULL, "%s", "%s", "%s", "%s", "%s","%s", "%s", "%s", "%s", "%s", "%s","%s",
#"%s", "%s", "%s", "%s")""" % (regionloci, chro, chrpos, repor, mappede, geneups, genedows, snp, upstr, downs,
#riskalle, snps, merged, snpid, contexts, intergenic))
cur.execute("""insert into loci values (NULL, "%s", "%s", "%s", "%s", "%s","%s", "%s", "%s", "%s", "%s", "%s","%s",
"%s", "%s", "%s", "%s")""" % (regionloci, chro, chrpos, repor, mappede, geneups, genedows, snp, upstr, downs,
riskalle, snps, merged, snpid, contexts, intergenic))
cur.execute("SELECT LAST_INSERT_ID()")
id_loci = cur.fetchall()[0][0]
myConnection.commit()
return id_loci
hostname = '127.0.0.1'
username = 'root'
password = 'fnsQFJ14'
database = 'new_schema'
myConnection = mysql.connector.connect( host=hostname, user=username, passwd=password, db=database )
for index, row in df.iterrows():
#print(type(loci[0]), type(loci[1]), type(loci[2]), type(loci[3]), type(loci[4]), type(loci[5]), type(loci[6]),
#type(loci[7]),type(loci[8]), type(loci[9]), type(loci[10]), type(loci[11]), type(loci[12]),
#type(loci[13]), type(loci[14]),type(loci[15]))
loci = [row['REGION'], row['CHR_ID'], row['CHR_POS'],row['REPORTED GENE(S)'],row['MAPPED_GENE'],row['UPSTREAM_GENE_ID'],
row['DOWNSTREAM_GENE_ID'], row['SNP_GENE_IDS'],row['UPSTREAM_GENE_DISTANCE'],row['DOWNSTREAM_GENE_DISTANCE'],
row['STRONGEST SNP-RISK ALLELE'],row['SNPS'], row['MERGED'], row['SNP_ID_CURRENT'], row['CONTEXT'],
row['INTERGENIC']]
loci_id = get_lociId(loci[0], loci[1], loci[2], loci[3], loci[4], loci[5], loci[6], loci[7], loci[8], loci[9], loci[10],
loci[11], loci[12], loci[13], loci[14], loci[15])
print()
myConnection.close()
In [ ]:
#Tabla intermedia: Enfermedad_loci
hostname = '127.0.0.1'
username = 'root'
password = 'fnsQFJ14'
database = 'new_schema'
myConnection = mysql.connector.connect( host=hostname, user=username, passwd=password, db=database )
cur = myConnection.cursor()
for index, row in df.iterrows():
enfermedadAux = row['DISEASE/TRAIT']
cur.execute("""select id_enfermedad from enfermedad where DISEASETRAIT = "%s" """ % (enfermedadAux) )
idenferAux=cur.fetchall()
cur.execute("""select id_loci from loci where CHR_ID = "%s" and CHR_POS = "%s" """ % (row.CHR_ID, row.CHR_POS))
idlociAux=cur.fetchall()
#print(idenferAux[0][0],idlociAux[0][0])
cur.execute("""select * from enfermedad_loci where id_loci = "%d" and id_enfermedad = "%d" """ %(idlociAux[0][0],
idenferAux[0][0]))
idiguales = cur.fetchall()
#print(idiguales)
if not idiguales:
cur.execute("""insert into enfermedad_loci (id_enfermedad, id_loci) values ("%d", "%d")""" %(idenferAux[0][0],
idlociAux[0][0]))
myConnection.commit()
myConnection.close()
In [ ]:
#¿Cuáles genes se encuentran relacionados con el cáncer de pulmón?
#Se responde a la anterior pregunta y se expone el cromosoma en el cuál se encuentra dicho gen.
hostname = '127.0.0.1'
username = 'root'
password = 'fnsQFJ14'
database = 'new_schema'
def doQuery( conn ) :
cur = conn.cursor()
myConnection = mysql.connector.connect( host=hostname, user=username, passwd=password, db=database )
doQuery( myConnection )
#myConnection.close()
cur = myConnection.cursor()
cur.execute("""SELECT r.CHR_ID, r.REPORTED_GENE
FROM loci r
WHERE NOT EXISTS (SELECT * FROM enfermedad i
WHERE DISEASETRAIT IN ('Lung cancer')
AND NOT EXISTS
(SELECT * FROM enfermedad_loci ri
WHERE ri.id_loci = r.id_loci
AND ri.id_enfermedad = i.id_enfermedad))
""")
enferme = cur.fetchall()
print(enferme)
In [7]:
import pandas as pd
my_df = pd.DataFrame(enferme)
my_df.to_csv('output.csv', index=False, header=False)